{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.6 规范化文本(P115)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.6.1 词干提取器\n",
    "词干提取器错误很多"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = \"\"\"DENNIS: Listen, strange women lying in ponds distributing swords\n",
    " is no basis for a system of government.  \n",
    " Supreme executive power derives from a mandate from the masses, \n",
    " not from some farcical aquatic ceremony.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'lying', 'in', 'ponds', 'distributing', 'swords', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'masses', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']\n"
     ]
    }
   ],
   "source": [
    "tokens = nltk.word_tokenize(raw)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']\n"
     ]
    }
   ],
   "source": [
    "# 词干提取器 Porter 比 Lancaster 要好点\n",
    "# Porter 词干提取器\n",
    "porter = nltk.PorterStemmer()\n",
    "stem_porter_list = [\n",
    "        porter.stem(t)\n",
    "        for t in tokens\n",
    "]\n",
    "print(stem_porter_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']\n"
     ]
    }
   ],
   "source": [
    "# Lancaster 词干提取器\n",
    "lancaster = nltk.LancasterStemmer()\n",
    "stem_lancaster_list = [\n",
    "        lancaster.stem(t)\n",
    "        for t in tokens\n",
    "]\n",
    "print(stem_lancaster_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex3-1 使用词干提取器索引文本\n",
    "class IndexedText(object):\n",
    "    def __init__(self, stemmer, text):\n",
    "        self._text = text\n",
    "        self._stemmer = stemmer\n",
    "        self._index = nltk.Index(\n",
    "                (self._stem(word), i)\n",
    "                for (i, word) in enumerate(text)\n",
    "        )\n",
    "\n",
    "    def concordance(self, word, width=40):\n",
    "        key = self._stem(word)\n",
    "        wc = int(width / 4)\n",
    "        for i in self._index[key]:\n",
    "            l_context = ' '.join(self._text[i - wc:i])\n",
    "            r_context = ' '.join(self._text[i:i + wc])\n",
    "            l_display = '{:>{width}}'.format(l_context[-width:], width=width)\n",
    "            r_display = '{:{width}}'.format(r_context[:width], width=width)\n",
    "            print(l_display, r_display)\n",
    "\n",
    "    def _stem(self, word):\n",
    "        return self._stemmer.stem(word).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "alves of coconut and you ' re bangin ' ' em together . ARTHUR : So ? We have ridd\n  built it all the same , just to show ' em . It sank into the swamp . So ,      \n     OFFICER # 2 : Come on . Back with ' em . Back . Right . Come along . INSPECT\n"
     ]
    }
   ],
   "source": [
    "grail = nltk.corpus.webtext.words('grail.txt')\n",
    "text = IndexedText(porter, grail)\n",
    "text.concordance('em')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "alves of coconut and you ' re bangin ' ' em together . ARTHUR : So ? We have ridd\n  built it all the same , just to show ' em . It sank into the swamp . So ,      \n     OFFICER # 2 : Come on . Back with ' em . Back . Right . Come along . INSPECT\n"
     ]
    }
   ],
   "source": [
    "text = IndexedText(lancaster, grail)\n",
    "text.concordance('em')"
   ]
  },
  {
   "source": [
    "### 3.6.2 词形归并(P117)\n",
    "WordNet 词形归并器将会删除词缀产生的词，即将变化的单词恢复原始形式，例如：women 转变成 woman"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']\n"
     ]
    }
   ],
   "source": [
    "wnl = nltk.WordNetLemmatizer()\n",
    "wnl_lemma_list = [\n",
    "        wnl.lemmatize(t)\n",
    "        for t in tokens\n",
    "]\n",
    "print(wnl_lemma_list)"
   ]
  }
 ]
}